Contents: NLP Project - 2
# Import all the relevant libraries needed to complete the analysis, visualization, modeling and presentation
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_style('darkgrid')
from scipy import stats
from scipy.stats import zscore
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn import model_selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay, precision_score, recall_score
from sklearn.metrics import precision_recall_curve, roc_curve, auc, roc_auc_score
from sklearn.metrics import plot_precision_recall_curve, average_precision_score
from sklearn.metrics import f1_score, plot_roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.svm import SVC
# from sklearn.decomposition import PCA
# from scipy.cluster.hierarchy import dendrogram, linkage
# from scipy.cluster.hierarchy import fcluster
# from sklearn.cluster import KMeans
# from sklearn.metrics import silhouette_samples, silhouette_score
# import xgboost as xgb
# from xgboost import plot_importance
# from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTENC, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
import pandas_profiling as pp
import gensim
import logging
# import cv2
# from google.colab.patches import cv2_imshow
# from glob import glob
# import itertools
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from tensorflow.keras.layers import Activation, GlobalMaxPool2D, GlobalAveragePooling2D
from tensorflow.keras.layers import UpSampling2D, Input, Concatenate
from tensorflow.keras.layers import BatchNormalization, LeakyReLU
from tensorflow.keras.optimizers import Adam, RMSprop, SGD, Adagrad
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.metrics import Recall, Precision
from tensorflow.keras import backend as K
from tensorflow import keras
from keras.utils.np_utils import to_categorical
from keras.utils import np_utils
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
import warnings
warnings.filterwarnings("ignore")
import random
from zipfile import ZipFile
# Set random_state
random_state = 42
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
from google.colab import drive
drive.mount('/content/drive')
# Current working directory
%cd "/content/drive/MyDrive/MGL/Project-NLP-2/"
# # List all the files in a directory
# for dirname, _, filenames in os.walk('path'):
# for filename in filenames:
# print(os.path.join(dirname, filename))
# List files in the directory
!ls
# Note: the IMDB dataset can also be loaded directly via the keras imdb.load_data() method.
# # Path of the data file
# path = 'IMDB Dataset.csv.zip'
# # Unzip files in the current directory
# with ZipFile (path,'r') as z:
# z.extractall()
# print("Training zip extraction done!")
# Import the dataset
df = pd.read_csv('IMDB Dataset.csv')
df.shape
df.info()
# pd.set_option('display.max_colwidth', None)
df.head()
# Clear the matplotlib plotting backend
%matplotlib inline
plt.close('all')
# Understand the distribution of the 'sentiment' target vector
f, axes = plt.subplots(1, 2, figsize=(17, 7))
df['sentiment'].value_counts().plot.pie(autopct='%1.1f%%', ax=axes[0])
# Pass the column as a keyword argument: positional data arguments to
# countplot were deprecated in seaborn 0.12 and now raise a TypeError.
sns.countplot(x='sentiment', data=df, ax=axes[1])
axes[0].set_title('Pie Chart for sentiment')
axes[1].set_title('Bar Graph for sentiment')
plt.show()
The dataset consists of two groups: positive reviews and negative reviews, in equal proportion.
It's evident that the dataset is very well balanced. This is a very favourable situation for a classification task.
# Visualize word cloud of random positive and negative review
# Choose randomly a positive review and a negative review
ind_positive = random.choice(list(df[df['sentiment'] == 'positive'].index))
ind_negative = random.choice(list(df[df['sentiment'] == 'negative'].index))
review_positive = df['review'][ind_positive]
review_negative = df['review'][ind_negative]
print('Positive review: ', review_positive)
print('\n')
print('Negative review: ', review_negative)
print('\n')
from wordcloud import WordCloud
cloud_positive = WordCloud().generate(review_positive)
cloud_negative = WordCloud().generate(review_negative)
plt.figure(figsize = (20,15))
plt.subplot(1,2,1)
plt.imshow(cloud_positive)
plt.title('Positive review')
plt.subplot(1,2,2)
plt.imshow(cloud_negative)
plt.title('Negative review')
plt.show()
# Text Cleaning
import re
def remove_url(text):
    """Strip URLs (http://, https://, and www.-prefixed) from *text*.

    Returns the text with each matched URL replaced by the empty string.
    """
    # The original pattern only matched 'https://', silently leaving plain
    # 'http://' links in the reviews; 'https?' covers both schemes.
    url_tag = re.compile(r'https?://\S+|www\.\S+')
    text = url_tag.sub(r'', text)
    return text
def remove_html(text):
    """Return *text* with every HTML/XML tag removed."""
    # Non-greedy '<...>' drops each tag while keeping the enclosed text.
    return re.sub(r'<.*?>', '', text)
def remove_punctuation(text):
    """Drop every character that is neither a word character nor whitespace."""
    cleaned = re.sub(r'[^\w\s]', '', text)
    return cleaned
def remove_special_character(text):
    """Keep only ASCII letters, digits, and whitespace characters."""
    # Note: this also strips underscores and accented letters, which the
    # earlier punctuation pass leaves behind.
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)
def remove_emojis(text):
    """Remove common emoji codepoints from *text*.

    Covers the emoticon, symbols & pictographs, transport & map symbols,
    and regional-indicator (flag) Unicode blocks.
    """
    pattern = (
        '['
        '\U0001F600-\U0001F64F'  # emoticons
        '\U0001F300-\U0001F5FF'  # symbols & pictographs
        '\U0001F680-\U0001F6FF'  # transport & map symbols
        '\U0001F1E0-\U0001F1FF'  # flags (iOS)
        ']+'
    )
    return re.sub(pattern, '', text, flags=re.UNICODE)
def clean_text(text):
    """Run the full cleaning pipeline over *text* and lowercase the result.

    Steps, in order: strip URLs, HTML tags, punctuation, remaining special
    characters, and emojis; then lowercase.
    """
    for step in (remove_url, remove_html, remove_punctuation,
                 remove_special_character, remove_emojis):
        text = step(text)
    return text.lower()
df['processed'] = df['review'].apply(lambda x: clean_text(x))
df['label'] = df['sentiment'].apply(lambda x: 0 if x == 'negative' else 1)
df.head()
# df = df.sample(n=1000, random_state = 0)
# Create the features matrix and target vector
df1=df[['processed', 'label']]
df1.head()
# Split the data for training and testing
# To be used in the transformers (BERT)
train, test = train_test_split(df1, test_size=0.5, random_state=0)
# Loading the IMDB dataset
# The argument num_words=10000 keeps the top 10,000 most frequently occurring words in the training data.
# The rare words are discarded to keep the size of the data manageable.
top_words = 10000
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data(path="imdb.npz",
num_words=top_words)
X_train
y_train
Let's take a moment to understand the format of the data. The dataset comes preprocessed: each example is an array of integers representing the words of the movie review. Each label is an integer value of either 0 or 1, where 0 is a negative review, and 1 is a positive review.
# Shape of training data
print("X_train: {}, y_train: {}".format(len(X_train),len(y_train)))
# Shape of test data
print("X_test: {}, y_test: {}".format(len(X_test),len(y_test)))
# The text of reviews have been converted to integers, where each integer represents a specific word in a dictionary.
# Looking at the first review
print(X_train[0])
print(y_train[0])
# Movie reviews may be different lengths. The below code shows the number of words in the first and second reviews.
# Since inputs to a NN/RNN must be the same length, we'll need to resolve this later.
len(X_train[0]), len(X_train[1])
# Convert integers back to text: Here, we'll create a helper function to query a dictionary object that contains the integer to string mapping:
# A dictionary mapping words to an integer index
imdb = keras.datasets.imdb
word_index = imdb.get_word_index()
# The first indices are reserved
word_index = {k:(v+3) for k,v in word_index.items()}
word_index["<PAD>"] =0
word_index["<START>"]=1
word_index["<UNK>"]=2 #unknown
word_index["<UNUSED>"] = 3
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
def decode_review(text):
    """Map a sequence of integer word ids back to a space-joined review string.

    Ids missing from the reverse index render as '?'.
    """
    words = (reverse_word_index.get(i, '?') for i in text)
    return ' '.join(words)
decode_review(X_train[0])
decode_review(X_train[1])
The reviews (integer arrays) must be converted to tensors before being fed into the neural network. This conversion can be done in many ways:
# pad_sequences is used to ensure that all sequences in a list have the same length. By default this is done by padding 0 in the beginning
# of each sequence until each sequence has the same length as the longest sequence.
# Since the sequences have different lengths, we use padding to bring all sequences to the same length.
# The parameter "maxlen" sets the maximum length of the output sequence.
# If the length of the input sequence is larger than "maxlen", it is truncated to keep only maxlen words (truncating = 'pre': keep the
# earlier part of the sequence; truncating = 'post': keep the later part of the sequence).
# If the length of the input sequence is smaller than "maxlen", 0 elements are padded at the beginning of the sequence
# (if padding = 'pre' - the default) or at the end of the sequence (if padding = 'post').
max_length = 256
trunc_type = 'post'
X_train = keras.preprocessing.sequence.pad_sequences(X_train, value=word_index["<PAD>"],padding="post",maxlen = max_length, truncating = trunc_type)
X_test = keras.preprocessing.sequence.pad_sequences(X_test, value=word_index["<PAD>"],padding="post",maxlen = max_length, truncating = trunc_type)
# Check the length of reviews again
len(X_train[0]), len(X_train[1])
# Check the first review after padding
X_train[0]
Hint: The aim here is to import the text and process it in such a way that it can be used as an input to the ML/NN classifiers. Be analytical and experimental here in trying new approaches to design the best model.
# Input shape is the vocabulary count used for the movie reviews (10,000 words)
vocab_size = 10000
embedding_dim = 16
max_length = 256
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, embedding_dim, input_length = max_length))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation=tf.nn.relu))
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))
model.summary()
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
H = model.fit(X_train, y_train, epochs = 10, batch_size = 128, validation_data = (X_test, y_test))
plt.figure(figsize = (12,5))
plt.subplot(1,2,1)
plt.plot(H.history['accuracy'], label = 'Train')
plt.plot(H.history['val_accuracy'], label = 'Validation')
plt.legend()
plt.title('Accuracy')
plt.subplot(1,2,2)
plt.plot(H.history['loss'], label = 'Train')
plt.plot(H.history['val_loss'], label = 'Validation')
plt.legend()
plt.title('Loss')
plt.show()
y_pred_proba = model.predict(X_test)
y_pred = np.array([0 if proba < 0.5 else 1 for proba in y_pred_proba])
# Classification Accuracy
print("Classification Accuracy:")
print('Loss and Accuracy on Training data:',model.evaluate(X_train, y_train))
print('Loss and Accuracy on Test data:',model.evaluate(X_test, y_test))
print()
# Classification Report
print("Classification Report:\n",classification_report(y_test, y_pred))
# Confusion Matrix
print("Confusion Matrix Chart:")
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index = [i for i in ['0', '1']],
columns = [i for i in ['0', '1']])
plt.figure(figsize = (12,10))
sns.heatmap(df_cm, annot=True, fmt='g')
plt.title('Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()
# Model comparison
precision = precision_score(y_test,y_pred, average='macro')
recall = recall_score(y_test,y_pred, average='macro')
f1 = f1_score(y_test,y_pred, average='macro')
Train_Accuracy = model.evaluate(X_train, y_train)
Test_Accuracy = model.evaluate(X_test, y_test)
base_1 = []
base_1.append(['ANN', Train_Accuracy[1], Test_Accuracy[1], precision, recall, f1])
model_comparison = pd.DataFrame(base_1,columns=['Model','Train Accuracy','Test Accuracy','Precision','Recall','F1 Score'])
model_comparison.sort_values(by=['Recall','F1 Score'], inplace=True, ascending=False)
# An approach for predicted and actual labels
for i in range(5):
print(decode_review(X_test[i]))
pred = model.predict(X_test[i].reshape(1, 256))
print('Prediction prob = ', pred, '\t Actual =', y_test[i])
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, AveragePooling1D, Bidirectional, LSTM, SimpleRNN, Dense
vocab_size = 10000
embedding_dim = 16
max_length = 256
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length = max_length))
model.add(Conv1D(filters = 32, kernel_size = 3, padding = 'same', activation = 'relu'))
model.add(AveragePooling1D(pool_size = 2))
# model.add(Bidirectional(SimpleRNN(32, dropout = 0.5)))
model.add((SimpleRNN(32, dropout = 0.5)))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
H = model.fit(X_train, y_train, epochs = 10, batch_size = 128, validation_data = (X_test, y_test), verbose=1)
plt.figure(figsize = (12,5))
plt.subplot(1,2,1)
plt.plot(H.history['accuracy'], label = 'Train')
plt.plot(H.history['val_accuracy'], label = 'Validation')
plt.legend()
plt.title('Accuracy')
plt.subplot(1,2,2)
plt.plot(H.history['loss'], label = 'Train')
plt.plot(H.history['val_loss'], label = 'Validation')
plt.legend()
plt.title('Loss')
plt.show()
y_pred_proba = model.predict(X_test)
y_pred = np.array([0 if proba < 0.5 else 1 for proba in y_pred_proba])
# Classification Accuracy
print("Classification Accuracy:")
print('Loss and Accuracy on Training data:',model.evaluate(X_train, y_train))
print('Loss and Accuracy on Test data:',model.evaluate(X_test, y_test))
print()
# Classification Report
print("Classification Report:\n",classification_report(y_test, y_pred))
# Confusion Matrix
print("Confusion Matrix Chart:")
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index = [i for i in ['0', '1']],
columns = [i for i in ['0', '1']])
plt.figure(figsize = (12,10))
sns.heatmap(df_cm, annot=True, fmt='g')
plt.title('Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()
# Model comparison
precision = precision_score(y_test,y_pred, average='macro')
recall = recall_score(y_test,y_pred, average='macro')
f1 = f1_score(y_test,y_pred, average='macro')
Train_Accuracy = model.evaluate(X_train, y_train)
Test_Accuracy = model.evaluate(X_test, y_test)
# base_1 = []
base_1.append(['RNN', Train_Accuracy[1], Test_Accuracy[1], precision, recall, f1])
model_comparison = pd.DataFrame(base_1,columns=['Model','Train Accuracy','Test Accuracy','Precision','Recall','F1 Score'])
model_comparison.sort_values(by=['Recall','F1 Score'], inplace=True, ascending=False)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, AveragePooling1D, Bidirectional, LSTM, SimpleRNN, GRU, Dense
vocab_size = 10000
embedding_dim = 16
max_length = 256
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length = max_length))
model.add(Conv1D(filters = 32, kernel_size = 3, padding = 'same', activation = 'relu'))
model.add(AveragePooling1D(pool_size = 2))
# model.add(Bidirectional(GRU(32, dropout = 0.5)))
model.add((GRU(32, dropout = 0.5)))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
H = model.fit(X_train, y_train, epochs = 10, batch_size = 128, validation_data = (X_test, y_test), verbose=1)
plt.figure(figsize = (12,5))
plt.subplot(1,2,1)
plt.plot(H.history['accuracy'], label = 'Train')
plt.plot(H.history['val_accuracy'], label = 'Validation')
plt.legend()
plt.title('Accuracy')
plt.subplot(1,2,2)
plt.plot(H.history['loss'], label = 'Train')
plt.plot(H.history['val_loss'], label = 'Validation')
plt.legend()
plt.title('Loss')
plt.show()
y_pred_proba = model.predict(X_test)
y_pred = np.array([0 if proba < 0.5 else 1 for proba in y_pred_proba])
# Classification Accuracy
print("Classification Accuracy:")
print('Loss and Accuracy on Training data:',model.evaluate(X_train, y_train))
print('Loss and Accuracy on Test data:',model.evaluate(X_test, y_test))
print()
# Classification Report
print("Classification Report:\n",classification_report(y_test, y_pred))
# Confusion Matrix
print("Confusion Matrix Chart:")
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index = [i for i in ['0', '1']],
columns = [i for i in ['0', '1']])
plt.figure(figsize = (12,10))
sns.heatmap(df_cm, annot=True, fmt='g')
plt.title('Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()
# Model comparison
precision = precision_score(y_test,y_pred, average='macro')
recall = recall_score(y_test,y_pred, average='macro')
f1 = f1_score(y_test,y_pred, average='macro')
Train_Accuracy = model.evaluate(X_train, y_train)
Test_Accuracy = model.evaluate(X_test, y_test)
# base_1 = []
base_1.append(['GRU', Train_Accuracy[1], Test_Accuracy[1], precision, recall, f1])
model_comparison = pd.DataFrame(base_1,columns=['Model','Train Accuracy','Test Accuracy','Precision','Recall','F1 Score'])
model_comparison.sort_values(by=['Recall','F1 Score'], inplace=True, ascending=False)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, AveragePooling1D, Bidirectional, LSTM, Dense
vocab_size = 10000
embedding_dim = 16
max_length = 256
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length = max_length))
model.add(Conv1D(filters = 32, kernel_size = 3, padding = 'same', activation = 'relu'))
model.add(AveragePooling1D(pool_size = 2))
# model.add(Bidirectional(LSTM(32, dropout = 0.5)))
model.add((LSTM(32, dropout = 0.5)))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
H = model.fit(X_train, y_train, epochs = 10, batch_size = 128, validation_data = (X_test, y_test), verbose=1)
plt.figure(figsize = (12,5))
plt.subplot(1,2,1)
plt.plot(H.history['accuracy'], label = 'Train')
plt.plot(H.history['val_accuracy'], label = 'Validation')
plt.legend()
plt.title('Accuracy')
plt.subplot(1,2,2)
plt.plot(H.history['loss'], label = 'Train')
plt.plot(H.history['val_loss'], label = 'Validation')
plt.legend()
plt.title('Loss')
plt.show()
y_pred_proba = model.predict(X_test)
y_pred = np.array([0 if proba < 0.5 else 1 for proba in y_pred_proba])
# Classification Accuracy
print("Classification Accuracy:")
print('Loss and Accuracy on Training data:',model.evaluate(X_train, y_train))
print('Loss and Accuracy on Test data:',model.evaluate(X_test, y_test))
print()
# Classification Report
print("Classification Report:\n",classification_report(y_test, y_pred))
# Confusion Matrix
print("Confusion Matrix Chart:")
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index = [i for i in ['0', '1']],
columns = [i for i in ['0', '1']])
plt.figure(figsize = (12,10))
sns.heatmap(df_cm, annot=True, fmt='g')
plt.title('Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()
# Model comparison
precision = precision_score(y_test,y_pred, average='macro')
recall = recall_score(y_test,y_pred, average='macro')
f1 = f1_score(y_test,y_pred, average='macro')
Train_Accuracy = model.evaluate(X_train, y_train)
Test_Accuracy = model.evaluate(X_test, y_test)
# base_1 = []
base_1.append(['LSTM', Train_Accuracy[1], Test_Accuracy[1], precision, recall, f1])
model_comparison = pd.DataFrame(base_1,columns=['Model','Train Accuracy','Test Accuracy','Precision','Recall','F1 Score'])
model_comparison.sort_values(by=['Recall','F1 Score'], inplace=True, ascending=False)
# Build the model
model = LogisticRegression()
# Train the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Classification Accuracy
print("Classification Accuracy:")
print('Accuracy on Training data:',model.score(X_train, y_train))
print('Accuracy on Test data:',model.score(X_test, y_test))
print()
# Classification Report
print("Classification Report:\n",classification_report(y_test, y_pred))
# Confusion Matrix
print("Confusion Matrix Chart:")
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index = [i for i in ['0', '1']],
columns = [i for i in ['0', '1']])
plt.figure(figsize = (12,10))
sns.heatmap(df_cm, annot=True, fmt='g')
plt.title('Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()
# Model comparison
precision = precision_score(y_test,y_pred, average='macro')
recall = recall_score(y_test,y_pred, average='macro')
f1 = f1_score(y_test,y_pred, average='macro')
Train_Accuracy = model.score(X_train, y_train)
Test_Accuracy = model.score(X_test, y_test)
# base_1 = []
base_1.append(['Logistic Regression', Train_Accuracy, Test_Accuracy, precision, recall, f1])
model_comparison = pd.DataFrame(base_1,columns=['Model','Train Accuracy','Test Accuracy','Precision','Recall','F1 Score'])
model_comparison.sort_values(by=['Recall','F1 Score'], inplace=True, ascending=False)
# Build the model
model = KNeighborsClassifier()
# Train the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Classification Accuracy
print("Classification Accuracy:")
print('Accuracy on Training data:',model.score(X_train, y_train))
print('Accuracy on Test data:',model.score(X_test, y_test))
print()
# Classification Report
print("Classification Report:\n",classification_report(y_test, y_pred))
# Confusion Matrix
print("Confusion Matrix Chart:")
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index = [i for i in ['0', '1']],
columns = [i for i in ['0', '1']])
plt.figure(figsize = (12,10))
sns.heatmap(df_cm, annot=True, fmt='g')
plt.title('Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()
# Model comparison
precision = precision_score(y_test,y_pred, average='macro')
recall = recall_score(y_test,y_pred, average='macro')
f1 = f1_score(y_test,y_pred, average='macro')
Train_Accuracy = model.score(X_train, y_train)
Test_Accuracy = model.score(X_test, y_test)
# base_1 = []
base_1.append(['K Neighbors', Train_Accuracy, Test_Accuracy, precision, recall, f1])
model_comparison = pd.DataFrame(base_1,columns=['Model','Train Accuracy','Test Accuracy','Precision','Recall','F1 Score'])
model_comparison.sort_values(by=['Recall','F1 Score'], inplace=True, ascending=False)
# Build the model
model = SVC()
# Train the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Classification Accuracy
print("Classification Accuracy:")
print('Accuracy on Training data:',model.score(X_train, y_train))
print('Accuracy on Test data:',model.score(X_test, y_test))
print()
# Classification Report
print("Classification Report:\n",classification_report(y_test, y_pred))
# Confusion Matrix
print("Confusion Matrix Chart:")
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index = [i for i in ['0', '1']],
columns = [i for i in ['0', '1']])
plt.figure(figsize = (12,10))
sns.heatmap(df_cm, annot=True, fmt='g')
plt.title('Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()
# Model comparison
precision = precision_score(y_test,y_pred, average='macro')
recall = recall_score(y_test,y_pred, average='macro')
f1 = f1_score(y_test,y_pred, average='macro')
Train_Accuracy = model.score(X_train, y_train)
Test_Accuracy = model.score(X_test, y_test)
# base_1 = []
base_1.append(['SVM', Train_Accuracy, Test_Accuracy, precision, recall, f1])
model_comparison = pd.DataFrame(base_1,columns=['Model','Train Accuracy','Test Accuracy','Precision','Recall','F1 Score'])
model_comparison.sort_values(by=['Recall','F1 Score'], inplace=True, ascending=False)
# Build the model
model = MultinomialNB()
# Train the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Classification Accuracy
print("Classification Accuracy:")
print('Accuracy on Training data:',model.score(X_train, y_train))
print('Accuracy on Test data:',model.score(X_test, y_test))
print()
# Classification Report
print("Classification Report:\n",classification_report(y_test, y_pred))
# Confusion Matrix
print("Confusion Matrix Chart:")
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index = [i for i in ['0', '1']],
columns = [i for i in ['0', '1']])
plt.figure(figsize = (12,10))
sns.heatmap(df_cm, annot=True, fmt='g')
plt.title('Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()
# Model comparison
precision = precision_score(y_test,y_pred, average='macro')
recall = recall_score(y_test,y_pred, average='macro')
f1 = f1_score(y_test,y_pred, average='macro')
Train_Accuracy = model.score(X_train, y_train)
Test_Accuracy = model.score(X_test, y_test)
# base_1 = []
base_1.append(['Multinomial NB', Train_Accuracy, Test_Accuracy, precision, recall, f1])
model_comparison = pd.DataFrame(base_1,columns=['Model','Train Accuracy','Test Accuracy','Precision','Recall','F1 Score'])
model_comparison.sort_values(by=['Recall','F1 Score'], inplace=True, ascending=False)
# Build the model
model = DecisionTreeClassifier()
# Train the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Classification Accuracy
print("Classification Accuracy:")
print('Accuracy on Training data:',model.score(X_train, y_train))
print('Accuracy on Test data:',model.score(X_test, y_test))
print()
# Classification Report
print("Classification Report:\n",classification_report(y_test, y_pred))
# Confusion Matrix
print("Confusion Matrix Chart:")
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index = [i for i in ['0', '1']],
columns = [i for i in ['0', '1']])
plt.figure(figsize = (12,10))
sns.heatmap(df_cm, annot=True, fmt='g')
plt.title('Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()
# Model comparison
precision = precision_score(y_test,y_pred, average='macro')
recall = recall_score(y_test,y_pred, average='macro')
f1 = f1_score(y_test,y_pred, average='macro')
Train_Accuracy = model.score(X_train, y_train)
Test_Accuracy = model.score(X_test, y_test)
# base_1 = []
base_1.append(['Decision Tree', Train_Accuracy, Test_Accuracy, precision, recall, f1])
model_comparison = pd.DataFrame(base_1,columns=['Model','Train Accuracy','Test Accuracy','Precision','Recall','F1 Score'])
model_comparison.sort_values(by=['Recall','F1 Score'], inplace=True, ascending=False)
# Build the model
model = RandomForestClassifier()
# Train the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Classification Accuracy
print("Classification Accuracy:")
print('Accuracy on Training data:',model.score(X_train, y_train))
print('Accuracy on Test data:',model.score(X_test, y_test))
print()
# Classification Report
print("Classification Report:\n",classification_report(y_test, y_pred))
# Confusion Matrix
print("Confusion Matrix Chart:")
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index = [i for i in ['0', '1']],
columns = [i for i in ['0', '1']])
plt.figure(figsize = (12,10))
sns.heatmap(df_cm, annot=True, fmt='g')
plt.title('Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()
# Model comparison
precision = precision_score(y_test,y_pred, average='macro')
recall = recall_score(y_test,y_pred, average='macro')
f1 = f1_score(y_test,y_pred, average='macro')
Train_Accuracy = model.score(X_train, y_train)
Test_Accuracy = model.score(X_test, y_test)
# base_1 = []
base_1.append(['Random Forest', Train_Accuracy, Test_Accuracy, precision, recall, f1])
model_comparison = pd.DataFrame(base_1,columns=['Model','Train Accuracy','Test Accuracy','Precision','Recall','F1 Score'])
model_comparison.sort_values(by=['Recall','F1 Score'], inplace=True, ascending=False)
# Build the model
model = AdaBoostClassifier()
# Train the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Classification Accuracy
print("Classification Accuracy:")
print('Accuracy on Training data:',model.score(X_train, y_train))
print('Accuracy on Test data:',model.score(X_test, y_test))
print()
# Classification Report
print("Classification Report:\n",classification_report(y_test, y_pred))
# Confusion Matrix
print("Confusion Matrix Chart:")
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index = [i for i in ['0', '1']],
columns = [i for i in ['0', '1']])
plt.figure(figsize = (12,10))
sns.heatmap(df_cm, annot=True, fmt='g')
plt.title('Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()
# Model comparison
precision = precision_score(y_test,y_pred, average='macro')
recall = recall_score(y_test,y_pred, average='macro')
f1 = f1_score(y_test,y_pred, average='macro')
Train_Accuracy = model.score(X_train, y_train)
Test_Accuracy = model.score(X_test, y_test)
# base_1 = []
base_1.append(['Ada Boost', Train_Accuracy, Test_Accuracy, precision, recall, f1])
model_comparison = pd.DataFrame(base_1,columns=['Model','Train Accuracy','Test Accuracy','Precision','Recall','F1 Score'])
model_comparison.sort_values(by=['Recall','F1 Score'], inplace=True, ascending=False)
model_comparison
# Install Transformers library
!pip install transformers
# Load the BERT Classifier and Tokenizer along with the Input modules
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# We have the main BERT model, a dropout layer to prevent overfitting, and finally a dense layer for classification task:
model.summary()
# We have two pandas Dataframe objects waiting for us to convert them into suitable objects for the BERT model.
# We will take advantage of the InputExample function that helps us to create sequences from our dataset.
# The InputExample function can be called as follows:
InputExample(guid=None,
text_a = "Hello, world",
text_b = None,
label = 1)
Now we will create two main functions:
def convert_data_to_examples(train, test, processed, label):
    """Wrap each row of the train/test DataFrames in a transformers InputExample.

    `processed` and `label` name the text and target columns. Returns the
    (train, validation) pandas Series of InputExample objects.
    """
    def _to_example(row):
        # guid is unused for this task; text_b is only needed for
        # sentence-pair tasks, so it stays None.
        return InputExample(guid=None,
                            text_a=row[processed],
                            text_b=None,
                            label=row[label])

    train_InputExamples = train.apply(_to_example, axis=1)
    validation_InputExamples = test.apply(_to_example, axis=1)
    return train_InputExamples, validation_InputExamples
train_InputExamples, validation_InputExamples = convert_data_to_examples(train,
test,
'processed',
'label')
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize InputExamples and expose them as a tf.data.Dataset.

    Each example's text is encoded to fixed-length (`max_length`) id,
    token-type, and attention-mask arrays, then yielded as
    ({"input_ids", "attention_mask", "token_type_ids"}, label) pairs
    suitable for TFBertForSequenceClassification.fit().
    """
    features = [] # -> will hold InputFeatures to be converted later
    for e in examples:
        # Documentation is really strong for this method, so please take a look at it
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length, # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            # NOTE(review): pad_to_max_length is deprecated in newer
            # transformers releases in favour of padding='max_length' —
            # confirm against the installed version.
            pad_to_max_length=True, # pads to the right by default # CHECK THIS for pad_to_max_length
            truncation=True
        )
        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
            input_dict["token_type_ids"], input_dict['attention_mask'])
        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label
            )
        )
    def gen():
        # Generator over the pre-tokenized features; from_generator pulls
        # from it lazily, so `features` must stay alive via this closure.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )
    # Output signature: int32 token arrays of unspecified length plus an
    # int64 scalar label.
    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )
# Column-name constants consumed by convert_data_to_examples below.
processed = 'processed'
label = 'label'
# Our dataset containing processed input sequences is ready to be fed to the model.
train_InputExamples, validation_InputExamples = convert_data_to_examples(train, test, processed, label)
train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
# Shuffle buffer of 100, mini-batches of 32, two passes over the data.
train_data = train_data.shuffle(100).batch(32).repeat(2)
validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)
# We will use Adam as our optimizer, SparseCategoricalCrossentropy as our loss function, and SparseCategoricalAccuracy as our accuracy metric.
# Fine-tuning the model for 2 epochs will give us around 90% accuracy, which is great.
# Training the model might take a while, so ensure you enabled the GPU acceleration from the Notebook Settings.
%%time
# from_logits=True because TFBertForSequenceClassification outputs raw logits (no softmax layer).
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])
H = model.fit(train_data, epochs=2, validation_data=validation_data)
# 50 min for maxlen = 128
# Making Predictions
# A list of two hand-written reviews: the first is positive, the second clearly negative.
pred_sentences = ['This was an awesome movie. I watch it twice my time watching this beautiful movie if I have known it was this good',
'One of the worst movies of all time. I cannot believe I wasted two hours of my life for this movie']
# We need to tokenize our reviews with our pre-trained BERT tokenizer. We will then feed these tokenized sequences to our model
# and run a final softmax layer to get the predictions. We can then use the argmax function to determine whether our sentiment
# prediction for the review is positive or negative. Finally, we will print out the results with a simple for loop.
# The following lines do all of these said operations:
tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)
# Softmax over the logits (tf_outputs[0]) gives per-class probabilities.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
labels = ['Negative','Positive']
# NOTE(review): this rebinds the `label` column-name variable defined above.
label = tf.argmax(tf_predictions, axis=1)
label = label.numpy()
for i in range(len(pred_sentences)):
print(pred_sentences[i], ": \n", labels[label[i]])
# Using the BERT on 5 test samples
predict_set = test[0:5]
pred_sentences = list(predict_set['processed'])
tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
labels = ['Negative','Positive']
label = tf.argmax(tf_predictions, axis=1)
label = label.numpy()
for i in range(len(pred_sentences)):
print(pred_sentences[i], ": \n", labels[label[i]])
In this project, we have learned how to clean and prepare text data to feed into various ML/DL models.
We have compared the performance of the models using precision, recall, F1 score, and train/test accuracies.
There are several further ideas that we could try in order to improve model performance.
DATA DESCRIPTION: The dataset is collected from two news websites, theonion.com and huffingtonpost.com. This new dataset has the following advantages over the existing Twitter datasets:
Content: Each record consists of three attributes:
PROJECT OBJECTIVE: Build a sequential NLP classifier which can use input text parameters to determine the customer sentiments.
# Import all the relevant libraries needed to complete the analysis, visualization, modeling and presentation
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_style('darkgrid')
from scipy import stats
from scipy.stats import zscore
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn import model_selection
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay, precision_score, recall_score
from sklearn.metrics import precision_recall_curve, roc_curve, auc, roc_auc_score
from sklearn.metrics import plot_precision_recall_curve, average_precision_score
from sklearn.metrics import f1_score, plot_roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.svm import SVC
# from sklearn.decomposition import PCA
# from scipy.cluster.hierarchy import dendrogram, linkage
# from scipy.cluster.hierarchy import fcluster
# from sklearn.cluster import KMeans
# from sklearn.metrics import silhouette_samples, silhouette_score
# import xgboost as xgb
# from xgboost import plot_importance
# from lightgbm import LGBMClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.ensemble import BaggingClassifier
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.ensemble import VotingClassifier
# from imblearn.over_sampling import RandomOverSampler
# from imblearn.over_sampling import SMOTENC, SMOTE, ADASYN
# from imblearn.under_sampling import RandomUnderSampler
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.snowball import SnowballStemmer
import pandas_profiling as pp
import gensim
import logging
# import cv2
# from google.colab.patches import cv2_imshow
# from glob import glob
# import itertools
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from tensorflow.keras.layers import Activation, GlobalMaxPool2D, GlobalAveragePooling2D
from tensorflow.keras.layers import UpSampling2D, Input, Concatenate
from tensorflow.keras.layers import BatchNormalization, LeakyReLU
from tensorflow.keras.optimizers import Adam, RMSprop, SGD, Adagrad
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.metrics import Recall, Precision
from tensorflow.keras import backend as K
from tensorflow import keras
from keras.utils.np_utils import to_categorical
from keras.utils import np_utils
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier, KerasRegressor
import warnings
warnings.filterwarnings("ignore")
import random
from zipfile import ZipFile
# Set random_state for reproducibility
random_state = 42
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)
from google.colab import drive
drive.mount('/content/drive')
# Current working directory
%cd "/content/drive/MyDrive/MGL/Project-NLP-2/"
# # List all the files in a directory
# for dirname, _, filenames in os.walk('path'):
# for filename in filenames:
# print(os.path.join(dirname, filename))
# List files in the directory
!ls
# # Path of the data file
# path = 'Sarcasm_Headlines_Dataset_v2.json.zip'
# # Unzip files in the current directory
# with ZipFile (path,'r') as z:
# z.extractall()
# print("Training zip extraction done!")
# Import the dataset
# Create a dataframe from the JSON-lines file (one record per line).
df = pd.read_json('Sarcasm_Headlines_Dataset_v2.json', lines=True)
df.shape
pd.set_option('display.max_colwidth', None)
df.info()
df.head()
# As the dataset is large, use a subset of the data; let's check what works on the local machine.
# Can use 10,000/100,000 later
# df = pd.read_csv("blogtext.csv", nrows=1000)
# df = df.sample(n=10000, random_state = 0)
# df.info()
# Check for unique values: 1 = Sarcastic, 0 = Not Sarcastic
df.is_sarcastic.value_counts()
# Check for NaN values
df.isna().sum()
# Describe function generates descriptive statistics that summarize the central tendency,
# dispersion and shape of a dataset’s distribution, excluding NaN values.
# This method tells us a lot of things about a dataset. One important thing is that
# the describe() method deals only with numeric values. It doesn't work with any
# categorical values. So if there are any categorical values in a column the describe()
# method will ignore it and display summary for the other columns.
df.describe(include='all').transpose()
# Clear the matplotlib plotting backend
%matplotlib inline
plt.close('all')
# Understand the distribution of the target variable 'is_sarcastic'.
f,axes=plt.subplots(1,2,figsize=(17,7))
df['is_sarcastic'].value_counts().plot.pie(autopct='%1.1f%%',ax=axes[0])
sns.countplot('is_sarcastic',data=df,ax=axes[1])
axes[0].set_title('Pie Chart for sarcasm')
axes[1].set_title('Bar Graph for sarcasm')
plt.show()
So, we can see that the dataset is balanced. That is good for a classification task.
# Keep only the text column and the target; the other columns are not needed.
df = df[['headline', 'is_sarcastic']]
df.head()
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import plotly.express as px
from plotly.offline import init_notebook_mode
import re
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
from nltk.stem import WordNetLemmatizer
import spacy
tqdm.pandas()
# Small English spaCy pipeline used for Named Entity Recognition below.
spacy_eng = spacy.load("en_core_web_sm")
nltk.download('stopwords')
lemm = WordNetLemmatizer()
init_notebook_mode(connected=True)
sns.set_style("darkgrid")
plt.rcParams['figure.figsize'] = (20,8)
plt.rcParams['font.size'] = 18
nltk.download('all')
# Text Cleaning:
# We will not remove numbers from the text data right away; let's further analyse whether they contain any relevant information.
# We can find the entity type of the tokens in the sentences using Named Entity Recognition (NER); this will help us identify
# the type and relevance of numbers in our text data.
stop_words = stopwords.words('english')
# Keep 'not' in the text: it is removed from the stop-word list so negation survives cleaning.
stop_words.remove('not')
def text_cleaning(x):
    """Normalize a headline for modeling.

    Steps: collapse whitespace, strip every non-alphanumeric character,
    lowercase, drop stop words (except 'not', which was removed from the
    stop list above), and lemmatize each remaining token as a verb.

    Parameters
    ----------
    x : str
        Raw headline text.

    Returns
    -------
    str
        The cleaned, single-space-joined headline.
    """
    # Fix: the original second re.sub() ran on the raw input `x`, silently
    # discarding the result of the first substitution. Chaining on
    # `headline` keeps both steps in effect. Regexes are raw strings to
    # avoid invalid-escape warnings.
    headline = re.sub(r'\s+\n+', ' ', x)
    headline = re.sub(r'[^a-zA-Z0-9]', ' ', headline)
    headline = headline.lower()
    tokens = headline.split()
    kept = [lemm.lemmatize(word, "v") for word in tokens if word not in stop_words]
    return ' '.join(kept)
def get_entities(x):
    """Run spaCy NER over *x* and return its entity labels, comma-joined.

    Returns an empty string when no entities are detected.
    """
    doc = spacy_eng(x)
    return ",".join(ent.label_ for ent in doc.ents)
# Tag each raw headline with its NER labels (progress_apply shows a tqdm bar).
df['entity'] = df['headline'].progress_apply(get_entities)
nltk.download('wordnet')
# Dataset with entity, clean_headline and sentence_length columns.
df['clean_headline'] = df['headline'].apply(text_cleaning)
df['sentence_length'] = df['clean_headline'].apply(lambda x: len(x.split()))
df
# Headline length distribution
# Check for outliers in the headline column
# Generally the headlines shouldn't be more than 20-40 words
# Box Plot
fig = px.histogram(df, x="sentence_length",height=700, color='is_sarcastic', title="Headlines Length Distribution", marginal="box")
fig.show(renderer="colab")
# Inspect and drop the single extreme outlier (a 107-word "headline").
df[df['sentence_length']==107]['headline']
df.drop(df[df['sentence_length'] == 107].index, inplace = True)
df.reset_index(inplace=True, drop=True)
# Headline length distribution: Outliers Removed
# The headlines after the removal of outliers do not exceed the limit of 20-40 words
# They are mostly centered in the range of 5-10 words
fig = px.histogram(df, x="sentence_length",height=700, color='is_sarcastic', title="Headlines Length Distribution", marginal="box")
fig.show(renderer="colab")
# Filtering: flag cleaned headlines that contain at least one digit.
df['contains_number'] = df['clean_headline'].apply(lambda x: bool(re.search(r'\d+', x)))
df
Analysis of samples containing numbers tagged as Time, Date, or Cardinal entities:
# Date Entity: Random samples of short number-bearing headlines tagged DATE.
df[(df['contains_number']) & (df['sentence_length']<=5) & (df['entity']=='DATE')].sample(10)
# Time Entity: Random samples tagged TIME.
df[(df['contains_number']) & (df['sentence_length']<=5) & (df['entity']=='TIME')].sample(10)
# Cardinal Entity: Random samples tagged CARDINAL.
df[(df['contains_number']) & (df['sentence_length']<=5) & (df['entity']=='CARDINAL')].sample(10)
Inference from NER:
# Wordcloud for text that is Not Sarcastic (LABEL - 0); built from the raw headlines.
plt.figure(figsize = (20,20)) # Text that is Not Sarcastic
wc = WordCloud(max_words = 2000 , width = 1600 , height = 800).generate(" ".join(df[df.is_sarcastic == 0].headline))
plt.imshow(wc , interpolation = 'bilinear')
# Wordcloud for text that is Sarcastic (LABEL - 1)
plt.figure(figsize = (20,20)) # Text that is Sarcastic
wc = WordCloud(max_words = 2000 , width = 1600 , height = 800).generate(" ".join(df[df.is_sarcastic == 1].headline))
plt.imshow(wc , interpolation = 'bilinear')
Hint: Be analytical and experimental here in trying new approaches to design the best model.
Parts 4 through 10 above are addressed together in the code cells below:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, Layer, Dense, Dropout, LayerNormalization, Input, GlobalAveragePooling1D
from tensorflow.keras.layers import LSTM, Bidirectional, SimpleRNN, GRU, Conv1D, MultiHeadAttention, AveragePooling1D
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
# Features are the cleaned headlines; target is the sarcasm flag.
X = df['clean_headline']
y = df['is_sarcastic']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
# Tokenization
# Splitting sentences into words
# Finding the vocab size
# Important Parameters to consider
max_len = 20
embedding_dim = 50
oov_token = '00_V'
padding_type = 'post'
trunc_type = 'post'
# NOTE(review): oov_token is defined above but never passed to Tokenizer(...);
# unseen test-set words are therefore dropped rather than mapped to an OOV index — confirm intent.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
# +1 because Keras word indices start at 1; index 0 is reserved for padding.
vocab_size = len(tokenizer.word_index) + 1
print("Vocab Size: ",vocab_size)
# Encoding of Inputs
# Converting the sentences to tokens followed by padded sequences in encoded format
# These are numeric encodings assigned to each word
train_sequences = tokenizer.texts_to_sequences(X_train)
X_train = pad_sequences(train_sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)
test_sequences = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(test_sequences, maxlen=max_len, padding=padding_type, truncating=trunc_type)
X_train[0]
y_train[0]
# # Path of the data file
# path = 'glove.6B.zip'
# # Unzip files in the current directory
# with ZipFile (path,'r') as z:
# z.extractall()
# print("Training zip extraction done!")
# Embedding matrix with 50 dimensions
from numpy import array
from numpy import asarray
from numpy import zeros
# Parse the GloVe text file into {word: 50-dim float32 vector}.
embeddings_dictionary = dict()
glove_file = open('glove.6B.50d.txt', encoding="utf8")
for line in glove_file:
records = line.split()
word = records[0]
vector_dimensions = asarray(records[1:], dtype='float32')
embeddings_dictionary [word] = vector_dimensions
glove_file.close()
vocab_size = len(tokenizer.word_index)+1
# Creating an embedding matrix for initial weights based on the pre-trained GloVe embedding.
# Words absent from GloVe keep the zero vector.
embedding_matrix = zeros((vocab_size, 50))
for word, index in tokenizer.word_index.items():
embedding_vector = embeddings_dictionary.get(word)
if embedding_vector is not None:
embedding_matrix[index] = embedding_vector
# Baseline ANN: GloVe embedding -> average pooling -> dense head, sigmoid output.
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length = max_len))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(64, activation=tf.nn.relu))
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))
model.summary()
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
H = model.fit(X_train, y_train, epochs = 10, batch_size = 128, validation_data = (X_test, y_test))
# Learning curves: accuracy and loss, train vs validation.
plt.figure(figsize = (12,5))
plt.subplot(1,2,1)
plt.plot(H.history['accuracy'], label = 'Train')
plt.plot(H.history['val_accuracy'], label = 'Validation')
plt.legend()
plt.title('Accuracy')
plt.subplot(1,2,2)
plt.plot(H.history['loss'], label = 'Train')
plt.plot(H.history['val_loss'], label = 'Validation')
plt.legend()
plt.title('Loss')
plt.show()
# Threshold the sigmoid probabilities at 0.5 to get hard class labels.
y_pred_proba = model.predict(X_test)
y_pred = np.array([0 if proba < 0.5 else 1 for proba in y_pred_proba])
# Classification Accuracy
print("Classification Accuracy:")
print('Loss and Accuracy on Training data:',model.evaluate(X_train, y_train))
print('Loss and Accuracy on Test data:',model.evaluate(X_test, y_test))
print()
# Classification Report
print("Classification Report:\n",classification_report(y_test, y_pred))
# Confusion Matrix
print("Confusion Matrix Chart:")
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index = [i for i in ['0', '1']],
columns = [i for i in ['0', '1']])
plt.figure(figsize = (12,10))
sns.heatmap(df_cm, annot=True, fmt='g')
plt.title('Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()
# Model comparison: macro-averaged metrics, appended to the shared results table.
precision = precision_score(y_test,y_pred, average='macro')
recall = recall_score(y_test,y_pred, average='macro')
f1 = f1_score(y_test,y_pred, average='macro')
Train_Accuracy = model.evaluate(X_train, y_train)
Test_Accuracy = model.evaluate(X_test, y_test)
base_1 = []
base_1.append(['ANN', Train_Accuracy[1], Test_Accuracy[1], precision, recall, f1])
model_comparison = pd.DataFrame(base_1,columns=['Model','Train Accuracy','Test Accuracy','Precision','Recall','F1 Score'])
model_comparison.sort_values(by=['Recall','F1 Score'], inplace=True, ascending=False)
# RNN variant: Conv1D + average pooling feeding a SimpleRNN, sigmoid output.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length = max_len))
model.add(Conv1D(filters = 32, kernel_size = 3, padding = 'same', activation = 'relu'))
model.add(AveragePooling1D(pool_size = 2))
# model.add(Bidirectional(SimpleRNN(64, dropout = 0.5)))
model.add((SimpleRNN(64, dropout = 0.5)))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
H = model.fit(X_train, y_train, epochs = 10, batch_size = 128, validation_data = (X_test, y_test), verbose=1)
# Learning curves: accuracy and loss, train vs validation.
plt.figure(figsize = (12,5))
plt.subplot(1,2,1)
plt.plot(H.history['accuracy'], label = 'Train')
plt.plot(H.history['val_accuracy'], label = 'Validation')
plt.legend()
plt.title('Accuracy')
plt.subplot(1,2,2)
plt.plot(H.history['loss'], label = 'Train')
plt.plot(H.history['val_loss'], label = 'Validation')
plt.legend()
plt.title('Loss')
plt.show()
# Threshold the sigmoid probabilities at 0.5 to get hard class labels.
y_pred_proba = model.predict(X_test)
y_pred = np.array([0 if proba < 0.5 else 1 for proba in y_pred_proba])
# Classification Accuracy
print("Classification Accuracy:")
print('Loss and Accuracy on Training data:',model.evaluate(X_train, y_train))
print('Loss and Accuracy on Test data:',model.evaluate(X_test, y_test))
print()
# Classification Report
print("Classification Report:\n",classification_report(y_test, y_pred))
# Confusion Matrix
print("Confusion Matrix Chart:")
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index = [i for i in ['0', '1']],
columns = [i for i in ['0', '1']])
plt.figure(figsize = (12,10))
sns.heatmap(df_cm, annot=True, fmt='g')
plt.title('Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()
# Model comparison: append RNN metrics to the table started by the ANN cell.
precision = precision_score(y_test,y_pred, average='macro')
recall = recall_score(y_test,y_pred, average='macro')
f1 = f1_score(y_test,y_pred, average='macro')
Train_Accuracy = model.evaluate(X_train, y_train)
Test_Accuracy = model.evaluate(X_test, y_test)
# base_1 = []
base_1.append(['RNN', Train_Accuracy[1], Test_Accuracy[1], precision, recall, f1])
model_comparison = pd.DataFrame(base_1,columns=['Model','Train Accuracy','Test Accuracy','Precision','Recall','F1 Score'])
model_comparison.sort_values(by=['Recall','F1 Score'], inplace=True, ascending=False)
# GRU variant: same Conv1D front-end as the RNN cell, with a GRU recurrent layer.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length = max_len))
model.add(Conv1D(filters = 32, kernel_size = 3, padding = 'same', activation = 'relu'))
model.add(AveragePooling1D(pool_size = 2))
# model.add(Bidirectional(GRU(32, dropout = 0.5)))
model.add((GRU(64, dropout = 0.5)))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
H = model.fit(X_train, y_train, epochs = 10, batch_size = 128, validation_data = (X_test, y_test), verbose=1)
# Learning curves: accuracy and loss, train vs validation.
plt.figure(figsize = (12,5))
plt.subplot(1,2,1)
plt.plot(H.history['accuracy'], label = 'Train')
plt.plot(H.history['val_accuracy'], label = 'Validation')
plt.legend()
plt.title('Accuracy')
plt.subplot(1,2,2)
plt.plot(H.history['loss'], label = 'Train')
plt.plot(H.history['val_loss'], label = 'Validation')
plt.legend()
plt.title('Loss')
plt.show()
# Threshold the sigmoid probabilities at 0.5 to get hard class labels.
y_pred_proba = model.predict(X_test)
y_pred = np.array([0 if proba < 0.5 else 1 for proba in y_pred_proba])
# Classification Accuracy
print("Classification Accuracy:")
print('Loss and Accuracy on Training data:',model.evaluate(X_train, y_train))
print('Loss and Accuracy on Test data:',model.evaluate(X_test, y_test))
print()
# Classification Report
print("Classification Report:\n",classification_report(y_test, y_pred))
# Confusion Matrix
print("Confusion Matrix Chart:")
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index = [i for i in ['0', '1']],
columns = [i for i in ['0', '1']])
plt.figure(figsize = (12,10))
sns.heatmap(df_cm, annot=True, fmt='g')
plt.title('Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()
# Model comparison: append GRU metrics to the shared results table.
precision = precision_score(y_test,y_pred, average='macro')
recall = recall_score(y_test,y_pred, average='macro')
f1 = f1_score(y_test,y_pred, average='macro')
Train_Accuracy = model.evaluate(X_train, y_train)
Test_Accuracy = model.evaluate(X_test, y_test)
# base_1 = []
base_1.append(['GRU', Train_Accuracy[1], Test_Accuracy[1], precision, recall, f1])
model_comparison = pd.DataFrame(base_1,columns=['Model','Train Accuracy','Test Accuracy','Precision','Recall','F1 Score'])
model_comparison.sort_values(by=['Recall','F1 Score'], inplace=True, ascending=False)
# LSTM variant: Conv1D front-end feeding a bidirectional LSTM, sigmoid output.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length = max_len))
model.add(Conv1D(filters = 32, kernel_size = 3, padding = 'same', activation = 'relu'))
model.add(AveragePooling1D(pool_size = 2))
model.add(Bidirectional(LSTM(64, dropout = 0.5)))
# model.add((LSTM(32, dropout = 0.5)))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
H = model.fit(X_train, y_train, epochs = 10, batch_size = 128, validation_data = (X_test, y_test), verbose=1)
# Learning curves: accuracy and loss, train vs validation.
plt.figure(figsize = (12,5))
plt.subplot(1,2,1)
plt.plot(H.history['accuracy'], label = 'Train')
plt.plot(H.history['val_accuracy'], label = 'Validation')
plt.legend()
plt.title('Accuracy')
plt.subplot(1,2,2)
plt.plot(H.history['loss'], label = 'Train')
plt.plot(H.history['val_loss'], label = 'Validation')
plt.legend()
plt.title('Loss')
plt.show()
# Threshold the sigmoid probabilities at 0.5 to get hard class labels.
y_pred_proba = model.predict(X_test)
y_pred = np.array([0 if proba < 0.5 else 1 for proba in y_pred_proba])
# Classification Accuracy
print("Classification Accuracy:")
print('Loss and Accuracy on Training data:',model.evaluate(X_train, y_train))
print('Loss and Accuracy on Test data:',model.evaluate(X_test, y_test))
print()
# Classification Report
print("Classification Report:\n",classification_report(y_test, y_pred))
# Confusion Matrix
print("Confusion Matrix Chart:")
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index = [i for i in ['0', '1']],
columns = [i for i in ['0', '1']])
plt.figure(figsize = (12,10))
sns.heatmap(df_cm, annot=True, fmt='g')
plt.title('Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()
# Model comparison: append LSTM metrics, then display the full table.
precision = precision_score(y_test,y_pred, average='macro')
recall = recall_score(y_test,y_pred, average='macro')
f1 = f1_score(y_test,y_pred, average='macro')
Train_Accuracy = model.evaluate(X_train, y_train)
Test_Accuracy = model.evaluate(X_test, y_test)
# base_1 = []
base_1.append(['LSTM', Train_Accuracy[1], Test_Accuracy[1], precision, recall, f1])
model_comparison = pd.DataFrame(base_1,columns=['Model','Train Accuracy','Test Accuracy','Precision','Recall','F1 Score'])
model_comparison.sort_values(by=['Recall','F1 Score'], inplace=True, ascending=False)
model_comparison
# Keep only the cleaned text and target for the transformer experiments.
df1 = df[['clean_headline', 'is_sarcastic']]
df1.head()
# Split the data for training and testing
# To be used in the transformers (BERT)
train, test = train_test_split(df1, test_size=0.5, random_state=0)
import locale
def getpreferredencoding(do_setlocale=True):
    """Replacement for locale.getpreferredencoding that always reports UTF-8.

    Installed as a monkeypatch below so library calls that query the locale
    encoding get a consistent answer. The *do_setlocale* flag exists only to
    match the stdlib signature and is ignored.
    """
    encoding = "UTF-8"
    return encoding
# Monkeypatch the stdlib so every locale.getpreferredencoding() call returns UTF-8.
locale.getpreferredencoding = getpreferredencoding
# Install Transformers library
!pip install transformers
# Load the BERT Classifier and Tokenizer along with the Input modules
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
# NOTE(review): `model` and `tokenizer` rebind the Keras model/Tokenizer
# variables used in the earlier cells.
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# We have the main BERT model, a dropout layer to prevent overfitting, and finally a dense layer for the classification task:
model.summary()
# We have two pandas DataFrame objects waiting for us to convert them into suitable objects for the BERT model.
# We will take advantage of the InputExample class that helps us to create sequences from our dataset.
# The InputExample constructor can be called as follows (demonstration only; result is discarded):
InputExample(guid=None,
text_a = "Hello, world",
text_b = None,
label = 1)
Now we will create two main functions:
def convert_data_to_examples(train, test, clean_headline, is_sarcastic):
    """Turn each row of the train/test splits into a transformers InputExample.

    *clean_headline* and *is_sarcastic* are the names of the text and label
    columns, respectively. Returns (train_examples, validation_examples) as
    pandas Series of InputExample objects.
    """
    def build(row):
        # guid is a bookkeeping ID only; it is not needed for fine-tuning.
        return InputExample(guid=None,
                            text_a=row[clean_headline],
                            text_b=None,
                            label=row[is_sarcastic])

    return train.apply(build, axis=1), test.apply(build, axis=1)
# Build InputExample series for the sarcasm dataset: text from the
# 'clean_headline' column, targets from the 'is_sarcastic' column.
train_InputExamples, validation_InputExamples = convert_data_to_examples(train,
test,
'clean_headline',
'is_sarcastic')
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize InputExamples and wrap them in a tf.data.Dataset.

    Parameters
    ----------
    examples : iterable of InputExample
        Examples produced by convert_data_to_examples().
    tokenizer : transformers tokenizer
        Encodes each text into input_ids / token_type_ids / attention_mask.
    max_length : int, optional
        Every sequence is padded or truncated to exactly this many tokens.

    Returns
    -------
    tf.data.Dataset
        Yields ({"input_ids", "attention_mask", "token_type_ids"}, label) pairs.
    """
    features = []  # will hold InputFeatures to be converted later
    for e in examples:
        # encode_plus adds the [CLS]/[SEP] special tokens and returns all three
        # tensors BERT expects.
        # Fix: the deprecated pad_to_max_length=True flag (the original code
        # carried a "CHECK THIS" note on it) is replaced by the documented
        # padding="max_length" argument — same behavior, pads on the right.
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,          # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            padding="max_length",           # pads to the right by default
            truncation=True,
        )
        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                label=e.label,
            )
        )

    def gen():
        # Lazy generator over the already-encoded features; consumed by
        # tf.data.Dataset.from_generator below.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )
# Column-name constants consumed by convert_data_to_examples below.
clean_headline = 'clean_headline'
is_sarcastic = 'is_sarcastic'
# Our dataset containing processed input sequences is ready to be fed to the model.
train_InputExamples, validation_InputExamples = convert_data_to_examples(train, test, clean_headline, is_sarcastic)
train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
# Shuffle buffer of 100, mini-batches of 32, two passes over the data.
train_data = train_data.shuffle(100).batch(32).repeat(2)
validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)
# We will use Adam as our optimizer, SparseCategoricalCrossentropy as our loss function, and SparseCategoricalAccuracy as our accuracy metric.
# Fine-tuning the model for 2 epochs will give us around 90% accuracy, which is great.
# Training the model might take a while, so ensure you enabled the GPU acceleration from the Notebook Settings.
%%time
# from_logits=True because TFBertForSequenceClassification outputs raw logits (no softmax layer).
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])
H = model.fit(train_data, epochs=2, validation_data=validation_data)
# 30 min for maxlen = 128
# Making Predictions
# A list of two hand-written headlines: the first is sarcastic, the second is not sarcastic.
pred_sentences = ['What planet did you come from?',
'This is really a very beautiful pic']
# We need to tokenize our headlines with our pre-trained BERT tokenizer. We will then feed these tokenized sequences to our model
# and run a final softmax layer to get the predictions. We can then use the argmax function to determine whether the
# prediction for the headline is sarcastic (1) or not (0). Finally, we will print out the results with a simple for loop.
# The following lines do all of these said operations:
tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)
# Softmax over the logits (tf_outputs[0]) gives per-class probabilities.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
labels = ['0','1']
label = tf.argmax(tf_predictions, axis=1)
label = label.numpy()
for i in range(len(pred_sentences)):
print(pred_sentences[i], ": \n", labels[label[i]])
# Using the BERT on 5 test samples
predict_set = test[0:5]
pred_sentences = list(predict_set['clean_headline'])
tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = model(tf_batch)
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
labels = ['0','1']
label = tf.argmax(tf_predictions, axis=1)
label = label.numpy()
for i in range(len(pred_sentences)):
print(pred_sentences[i], ": \n", labels[label[i]])